Temporal overview

p_year %>% 
  inner_join(poems,by=c("p_id")) %>%
  count(collection,year) %>%
  mutate(measure="yearly count") %>%
  union_all(
    p_year %>% # 10 year rolling mean
      distinct(year) %>% 
      left_join(p_year %>% distinct(year),sql_on="RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>%
      inner_join(p_year,by=c("year.y"="year")) %>%
      inner_join(poems,by=c("p_id")) %>%
      group_by(collection=collection,year=year.x) %>%
      summarize(n=n()/n_distinct(year.y),.groups="drop") %>%
      mutate(measure="10 year rolling mean")
  ) %>%
  filter(collection!="literary",!year %in% c(0,9999)) %>%
  mutate(year=if_else(year>=1800,year,1780)) %>%
  group_by(collection,measure,year) %>%
  summarise(n=sum(n),.groups="drop") %>%
  collect() %>%
  complete(year,collection,measure,fill=list(n=0)) %>%
  mutate(collection=fct_relevel(str_to_upper(collection),"ERAB","SKVR","JR")) %>%
  group_by(collection,measure) %>%
  arrange(year) %>%
  filter(n!=0 | lag(n)!=0 | lead(n)!=0) %>%
  ungroup() %>%
  mutate(youtlier=n>4600,xoutlier=year<1800) %>%
  ggplot(aes(x=year,y=n,color=collection)) +
  geom_point(data=~.x %>% filter(measure=="yearly count",youtlier==FALSE),size=0.5) +
  geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=5000) +
  geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=5000, show.legend=FALSE) +
  geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n)) +
  geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) +  
  geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) +
  theme_hsci_discrete(base_family="Arial") + 
  theme(
    legend.justification=c(0,1), 
    legend.position=c(0.02, 0.98), 
    legend.background = element_blank(), 
    legend.key=element_blank()
  ) + 
  labs(color=NULL) +
  coord_cartesian(ylim=c(0,4600),xlim=c(1800,1970),clip="off") +
  scale_y_continuous(breaks=seq(0,20000,by=1000),labels=scales::number) +
#  ylab("Poems") +
  ylab("Runojen määrä") +
  scale_x_continuous(breaks=seq(1000,2000,by=10)) +
#  xlab("Year") +
  xlab("Vuosi") +
  ggtitle("")
## Warning: Missing values are always removed in SQL aggregation functions.
## Use `na.rm = TRUE` to silence this warning
## This warning is displayed once every 8 hours.

#  ggtitle("Runojen määrä vuosittain ja kokoelmittain")
#  ggtitle("Number of poems by year and collection")
top_top_types <- p_typ %>%
  filter(!is_minor) %>%
  inner_join(poems) %>%
  inner_join(types_to_top_level_types) %>%
  count(collection, ancestor_t_id) %>%
  group_by(collection) %>%
  slice_max(n,n=9) %>%
  ungroup() %>% 
  mutate(top_type=TRUE) %>%
  select(ancestor_t_id,top_type) %>%
  compute_a(temporary=TRUE, overwrite=TRUE)
d <- p_year %>% 
  inner_join(poems,by=c("p_id")) %>%
  left_join(p_typ %>%
              filter(!is_minor) %>%
    inner_join(types_to_top_level_types %>% 
                 inner_join(types %>%
                              filter(!str_detect(type_orig_id,"^erab_orig")) %>%
                        select(ancestor_t_id=t_id,ancestor_type_name=name))) %>%
    left_join(top_top_types) %>%
    mutate(
      ancestor_type_name=if_else(!is.na(top_type),ancestor_type_name,"Muut"),
      ancestor_t_id=if_else(!is.na(top_type),ancestor_t_id,-1),
      )
  ) %>%
  replace_na(list(ancestor_type_name="Tuntematon", ancestor_t_id=-2)) %>%
  distinct(ancestor_t_id,ancestor_type_name, collection, year, p_id) %>%
  count(ancestor_t_id,ancestor_type_name, collection, year) %>%
  mutate(measure="yearly count") %>%
  union_all(
    p_year %>% # 10 year rolling mean
      distinct(year) %>% 
      left_join(p_year %>% distinct(year),sql_on="RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>%
      inner_join(p_year,by=c("year.y"="year")) %>%
      inner_join(poems,by=c("p_id")) %>%
      left_join(p_typ %>%
        inner_join(types_to_top_level_types %>% 
                     inner_join(types %>% 
                                  filter(!str_detect(type_orig_id,"^erab_orig")) %>% 
                                  select(ancestor_t_id=t_id,ancestor_type_name=name))) %>%
        left_join(top_top_types) %>%
        mutate(
          ancestor_type_name=if_else(!is.na(top_type),ancestor_type_name,"Muut"),
          ancestor_t_id=if_else(!is.na(top_type),ancestor_t_id,-1),
          )
      ) %>%
      replace_na(list(ancestor_type_name="Tuntematon", ancestor_t_id=-2)) %>%
      distinct(ancestor_t_id,ancestor_type_name, collection, year.x, year.y, p_id) %>%
      group_by(ancestor_t_id,ancestor_type_name, collection, year=year.x) %>%
      summarize(n=n()/n_distinct(year.y),.groups="drop") %>%
      mutate(measure="10 year rolling mean")
  ) %>%
  filter(collection!="literary",!year %in% c(0L,9999L)) %>%
  mutate(year=if_else(year>=1800L,year,1780L)) %>%
  group_by(ancestor_type_name, collection, measure, year) %>%
  summarise(n=sum(n),.groups="drop") %>%
  collect()
d %>%
  mutate(collection=fct_relevel(str_to_upper(collection),"SKVR","ERAB","JR")) %>%
  filter(collection=="SKVR") %>%
  complete(ancestor_type_name, year,collection,measure,fill=list(n=0)) %>%
  group_by(ancestor_type_name, collection,measure) %>%
  arrange(year) %>%
  filter(n!=0 | lag(n)!=0 | lead(n)!=0) %>%
  ungroup() %>%
  mutate(youtlier=n>1300,xoutlier=year<1800) %>%
  ggplot(aes(x=year,y=n,color=ancestor_type_name)) +
#  facet_wrap(~collection) +
  geom_point(data=~.x %>% filter(measure=="yearly count",youtlier==FALSE,xoutlier==FALSE),size=0.5) +
  geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=1400) +
  geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=1400, show.legend=FALSE) +
  geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n)) +
  geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) +  
  geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) +
  theme_hsci_discrete(base_family="Arial") + 
  theme(
    legend.justification=c(0,1), 
    legend.position=c(0.02, 0.98), 
    legend.background = element_blank(), 
    legend.key=element_blank()
  ) + 
  labs(color=NULL) +
  coord_cartesian(ylim=c(0,1300),xlim=c(1800,1940),clip="off") +
  scale_y_continuous(breaks=seq(0,20000,by=500),labels=scales::number) +
#  ylab("Poems") +
  ylab("Runojen määrä") +
  scale_x_continuous(breaks=seq(1000,2000,by=10)) +
#  xlab("Year") +
  xlab("Vuosi") +
  ggtitle("")

#  ggtitle("Runojen määrä vuosittain ja kokoelmittain")
#  ggtitle("Number of poems by year and collection")
youtlier_limit = 2000
d %>%
  mutate(collection=fct_relevel(str_to_upper(collection),"SKVR","ERAB")) %>%
  filter(collection=="ERAB") %>%
  complete(ancestor_type_name, year,collection,measure,fill=list(n=0)) %>%
  group_by(ancestor_type_name, collection,measure) %>%
  arrange(year) %>%
  filter(n!=0 | lag(n)!=0 | lead(n)!=0) %>%
  ungroup() %>%
  mutate(youtlier=n>youtlier_limit,xoutlier=year<1800) %>%
  mutate(ancestor_type_name=case_match(ancestor_type_name,
    "Laulud noorrahva elust" ~ "Laulut nuorison elämästä (Laulud noorrahva elust)",
    "Muut" ~ "Muut (sisältää 17 luokkaa)",
    "Laulud meelelahutamiseks" ~ "Viihdytyslaulut (Laulud meelelahutamiseks)",
    "Lüroeepilised laulud" ~ "Lyroeeppiset laulut (Lüroeepilised laulud)",
    "Laulud laulust" ~ "Laulut laulusta (Laulud laulust)",
    "Töölaulud" ~ "Työlaulut (Töölaulud)",
    "Looduslaulud" ~ "Laulut luonnosta (Looduslaulud)",
    "Laulud ühiskondlikest vahekordadest" ~ "Laulut yhteiskunnallisista suhteista\n(Laulud ühiskondlikest vahekordadest)",
    "Murelaulud" ~ "Huolilaulut (Murelaulud)",
    "Laulud abielust" ~ "Laulut avioelämästä (Laulud abielust)",
    "Kalendrilaulud" ~ "Kalendaarilaulut (Kalendrilaulud)",
    .default=ancestor_type_name
  )) %>%
  mutate(ancestor_type_name=fct_reorder(ancestor_type_name,n,.fun=sum,.desc=TRUE)) %>%
  mutate(ancestor_type_name=fct_relevel(ancestor_type_name, "Muut (sisältää 17 luokkaa)", after=Inf)) %>%
  ggplot(aes(x=year,y=n,color=ancestor_type_name)) +
#  facet_wrap(~collection) +
  geom_point(data=. %>% filter(measure=="yearly count",youtlier==FALSE,xoutlier==FALSE),size=0.5) +
  geom_point(data=. %>% filter(youtlier==TRUE),aes(x=year),y=youtlier_limit+100) +
  geom_text_repel(data=. %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=youtlier_limit+100, show.legend=FALSE) +
  geom_point(data=. %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n)) +
  geom_text_repel(data=. %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) +  
  geom_line(data=. %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) +
  theme_hsci_discrete(base_family="Arial") + 
  theme(
    legend.justification=c(0,1), 
    legend.position=c(0.02, 0.98), 
    legend.background = element_blank(), 
    legend.key=element_blank()
  ) + 
  labs(color=NULL) +
  coord_cartesian(ylim=c(0,youtlier_limit),xlim=c(1800,1950),clip="off") +
  scale_y_continuous(breaks=seq(0,20000,by=200),labels=scales::number) +
#  ylab("Poems") +
  ylab("Runojen määrä") +
#  guides(color=guide_legend(nrow=2)) +
  scale_x_continuous(breaks=seq(1000,2000,by=10)) +
#  xlab("Year") +
  xlab("Vuosi") +
  ggtitle("")

#  ggtitle("Runojen määrä vuosittain ja kokoelmittain")
#  ggtitle("Number of poems by year and collection")
d %>%
  mutate(collection=fct_relevel(str_to_upper(collection),"SKVR","ERAB","JR")) %>%
  filter(collection=="JR") %>%
  complete(ancestor_type_name, year,collection,measure,fill=list(n=0)) %>%
  group_by(ancestor_type_name, collection,measure) %>%
  arrange(year) %>%
  filter(n!=0 | lag(n)!=0 | lead(n)!=0) %>%
  ungroup() %>%
  mutate(youtlier=n>6500,xoutlier=year<1800) %>%
  ggplot(aes(x=year,y=n,color=ancestor_type_name)) +
#  facet_wrap(~collection) +
  geom_point(data=~.x %>% filter(measure=="yearly count",youtlier==FALSE),size=0.5) +
  geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=7200) +
  geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=7200, show.legend=FALSE) +
  geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n)) +
  geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) +  
  geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) +
  theme_hsci_discrete(base_family="Arial") + 
  theme(
    legend.justification=c(0,1), 
    legend.position=c(0.02, 0.98), 
    legend.background = element_blank(), 
    legend.key=element_blank()
  ) + 
  labs(color=NULL) +
  coord_cartesian(ylim=c(0,6500),xlim=c(1800,1960),clip="off") +
  scale_y_continuous(breaks=seq(0,20000,by=1000),labels=scales::number) +
#  ylab("Poems") +
  ylab("Runojen määrä") +
  scale_x_continuous(breaks=seq(1000,2000,by=10)) +
#  xlab("Year") +
  xlab("Vuosi") +
  ggtitle("")

#  ggtitle("Runojen määrä vuosittain ja kokoelmittain")
#  ggtitle("Number of poems by year and collection")
p_year %>% 
  filter(year %in% c(0,9999)) %>% 
  left_join(poems) %>% 
  count(collection,year) %>%
  ungroup() %>%
  gt() %>%
  tab_header(title="Abnormal years") %>%
  fmt_integer(n)
Abnormal years
collection year n
skvr 9999 469
erab 0 6,670

Overview of collectors

poems %>% 
  distinct(collection) %>%
  pull() %>%
  map(~p_col %>% 
    inner_join(poems %>% filter(collection==.x),by=c("p_id")) %>%
    count(col_id) %>%
    left_join(collectors,by=c("col_id")) %>%
    select(col_id,name,n) %>%
    collect() %>%
    mutate(col_id=fct_reorder(str_c(col_id,"|",name),n)) %>%
    mutate(col_id=fct_lump_n(col_id,n=100,w=n)) %>%
    mutate(col_id=fct_relevel(col_id,"Other")) %>%
    group_by(col_id) %>%
    tally(wt=n) %>% {
      ggplot(.,aes(x=col_id,y=n)) +
      geom_col() +
      geom_text(aes(label=p(n)),hjust='left',nudge_y = 100) +
      theme_hsci_discrete(base_family="Arial") +
      coord_flip() +
      labs(title=str_c("Collectors in ",.x))
    }
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `col_id = fct_relevel(col_id, "Other")`.
## Caused by warning:
## ! 1 unknown level in `f`: Other
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

p_col %>% 
  anti_join(collectors) %>%
  count(col_id) %>%
  gt() %>%
  tab_header(title="Collectors without a name") %>%
  fmt_integer(n)
Collectors without a name
col_id n
7649 93
7650 1
p_col %>%
  inner_join(collectors) %>%
  inner_join(poems) %>%
  filter(collection!="literary") %>%
  mutate(collection=str_to_upper(collection)) %>%
  count(collection,col_id,name) %>%
  group_by(collection) %>%
  slice_max(order_by=n,n=10) %>%
  ungroup() %>%
  select(-col_id) %>%
  gt(groupname_col = "collection", rowname_col = "name") %>%
  row_group_order(c("ERAB","SKVR","JR")) %>%
  fmt_integer(n,sep_mark=" ")
n
ERAB
Rosenstrauch, Karl Voldemar 3 825
Viljak, Karl 3 225
Ostrov, Mihkel 2 300
Vilberg (Vilbaste), Gustav 2 218
Viidalepp (Viidebaum), Richard 2 134
Kallas, Oskar Philipp 1 723
Seen, Gustav 1 473
Penna, Peeter 1 312
Tampere, Herbert 1 188
koguja teadmata 1 175
SKVR
Krohn, Kaarle 4 116
Paulaharju, Samuli ja Jenny 3 157
Alava, Vihtori 3 089
Europaeus, D. E. D. 2 899
Neovius, A. D. 2 535
Porkka, Volmari 2 424
Lönnrot, Elias 2 402
Perä-Pohjolan ja Lapin Kotiseutuyhdistys 2 172
Salminen, Väinö 1 761
Vihervaara, Eemeli 1 702
JR
Hämeenlinnan alakansakouluseminaari 6 782
Perä-Pohjolan ja Lapin Kotiseutuyhdistys 3 463
Kärki, Frans 3 157
Railonsala, Artturi 2 452
Paulaharju, Samuli ja Jenny 2 168
Sääski, Sylvi 1 666
Saarijärven yhteiskoulu 1 652
Pennanen, Olavi 1 448
Paavolainen, Oma Martti 1 265
Lönnrot, Elias 1 239
p_col %>%
  inner_join(collectors) %>%
  inner_join(poems) %>%
  filter(collection!="literary") %>%
  mutate(collection=str_to_upper(collection)) %>%
  count(collection,col_id,name) %>%
  group_by(collection) %>%
  slice_max(order_by=n,n=10) %>%
  ungroup() %>%
  inner_join(p_col) %>%
  inner_join(p_typ) %>%
  inner_join(types_to_top_level_types) %>%
  count(collection, col_id, t_id=ancestor_t_id) %>%
  inner_join(types %>% rename(type_name=name)) %>%
  inner_join(collectors) %>%
  filter(collection=="SKVR") %>%
  collect() %>% 
#  group_by(collection) %>%
#  mutate(type_name=fct_lump_n(type_name, n, n=5, other_level="Muut")) %>%
#  ungroup() %>%
  group_by(col_id) %>%
  mutate(tn=sum(n)) %>%
  ungroup() %>%
  mutate(name=fct_reorder(name,tn)) %>%
  ggplot(aes(x=name,fill=type_name,y=n)) +
#  facet_wrap(~collection,scales="free",ncol=1) +
  geom_col() +
  theme_hsci_discrete() +
  coord_flip() +
  scale_y_continuous(labels=scales::number) +
  labs(fill="Päätyyppi") +
  xlab("Kerääjä") +
  ylab("Tyyppimerkintöjä")

Geographical overview

d <- p_pl %>% 
  inner_join(poems %>% filter(collection!="literary")) %>%
  count(collection, pl_id) %>% 
  inner_join(places) %>%
  select(collection, pl_id, name,n) %>%
  collect()

poems_without_location <- poems %>% 
  filter(collection!="literary") %>%
  anti_join(p_pl, join_by(p_id)) %>% 
  count() %>% 
  pull()

unprojected_locations <- d %>%
  anti_join(polygons, join_by(pl_id)) %>%
  add_row(name=NA,n=poems_without_location)

best_polygons <- polygons %>% 
  filter(is_primary==T) %>% 
  group_by(pl_id) %>% 
  filter(map_id==min(map_id)) %>%
  ungroup()
tm_shape(polygons %>%
  filter(map_id==0)) +
  tm_polygons() +
  tm_shape(best_polygons %>%
    inner_join(d, join_by(pl_id)), point.per="largest") +
  tm_bubbles(col='n', size='n', id='name', style='fisher', palette='plasma') +
  tm_layout(title=str_c("Geographical overview. Missing ",unprojected_locations %>% tally(wt=n) %>% pull() %>% p," poems.")) +
  tm_facets(by="collection", ncol=1)

library(SpatialKDE)
md <- best_polygons %>%
    inner_join(d, join_by(pl_id)) 
rp <- md %>%
  group_by(collection) %>%
  group_map(~st_sample(.x,size=.x %>% pull(n), progress=TRUE) %>% st_sf())
names(rp) = md %>%
  group_by(collection) %>% 
  group_keys() %>% pull()
library(mappp)
cell_size <- 15000
band_width <- 15000
grid <- md %>% create_grid_hexagonal(cell_size = cell_size, side_offset = band_width)
kde2 <- rp %>% mappp(~kde(.x, band_width = band_width,kernel = "quartic", grid = grid), parallel=TRUE)
## 
  |                                          |   0%, ETA NA
kde2 %>% imap(~
tm_shape(.x %>%
  st_intersection(best_polygons)) +
  tm_fill(col = "kde_value", palette = "plasma", style="fisher", title = "KDE Estimate") +
  tm_layout(title=.y))
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries

## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries

## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
## $erab

## 
## $jr

## 
## $skvr

Poem locations not mapped

unprojected_locations %>%
  arrange(desc(n)) %>%
  gt() %>%
  tab_header("Poem locations not mapped") %>%
  fmt_integer(n)
Poem locations not mapped
collection pl_id name n
NA NA NA 4,369
erab 776 välismaa 1,840
erab 875 Tartu 1,028
erab 811 Tallinn 757
erab 894 Viljandi l. 662
erab 891 Pärnu l. 611
erab 889 Narva l. 495
erab 892 Rakvere l. 319
erab 897 Valga 151
erab 890 Paide l. 129
erab 887 Haapsalu 86
erab 895 Võru l. 68
erab 778 Valgamaa 58
erab 766 Virumaa 44
erab 893 Sõrve 40
erab 888 Hiiumaa 23
erab 896 Kuressaare l. 20
erab 898 Tapa l. 13
erab 899 Nõmme l. 8
skvr 667 Suma 6
jr 679 Pietari 2
skvr 607 Uuniemi 1
jr 609 Pohjois-Varanki 1

Geographical overview by collection

d <- p_pl %>% 
  inner_join(poems %>% filter(collection!="literary")) %>%
  count(collection, pl_id) %>% 
  inner_join(places) %>%
  select(collection, pl_id, name,n) %>%
  collect()

poems_without_location <- poems %>% 
  filter(collection!="literary") %>%
  anti_join(p_pl, join_by(p_id)) %>% 
  count() %>% 
  pull()

unprojected_locations <- d %>%
  anti_join(polygons, join_by(pl_id)) %>%
  add_row(name=NA,n=poems_without_location)

best_polygons <- polygons %>% 
  filter(is_primary==T) %>% 
  group_by(pl_id) %>% 
  filter(map_id==min(map_id)) %>%
  ungroup()
poems %>% 
  distinct(collection) %>%
  pull() %>%
  map(~
    tm_shape(
      polygons %>%
        left_join(
          p_pl %>% 
            inner_join(poems %>% filter(collection==.x),by=c("p_id")) %>%
            count(pl_id) %>% 
            inner_join(places) %>%
            select(pl_id,n) %>%
            collect()
        )
    ) +
    tm_polygons(col='n', id='name', style='fisher', palette='plasma') +
    tm_layout(title=str_c("Geography of ",.x,". Missing ",unprojected_locations %>% filter(collection==.x) %>% tally(wt=n) %>% pull() %>% p," poems."))
  )
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

Poem locations not mapped by collection

poems %>% 
  distinct(collection) %>%
  pull() %>%
  map(~
    unprojected_locations %>%
      filter(collection==.x) %>%
      arrange(desc(n)) %>%
      select(-collection) %>%
      gt() %>%
      tab_header(str_c("Poem locations not mapped in ",.x)) %>%
      fmt_integer(n)
  )
[[1]]
Poem locations not mapped in skvr
pl_id name n
667 Suma 6
607 Uuniemi 1
[[2]]
Poem locations not mapped in erab
pl_id name n
776 välismaa 1,840
875 Tartu 1,028
811 Tallinn 757
894 Viljandi l. 662
891 Pärnu l. 611
889 Narva l. 495
892 Rakvere l. 319
897 Valga 151
890 Paide l. 129
887 Haapsalu 86
895 Võru l. 68
778 Valgamaa 58
766 Virumaa 44
893 Sõrve 40
888 Hiiumaa 23
896 Kuressaare l. 20
898 Tapa l. 13
899 Nõmme l. 8
[[3]]
Poem locations not mapped in jr
pl_id name n
679 Pietari 2
609 Pohjois-Varanki 1
[[4]]
Poem locations not mapped in literary
pl_id name n

Informants

raw_meta %>% 
  filter(field=="INF") %>%
  mutate(value_c=str_replace_all(value,"^\\s*[A-Za-zÅÄÖåäö][a-zåäö][a-zåäö]+\\.","")) %>% 
  mutate(name=str_replace_all(value_c,"\\s*\"*([A-Za-zÅÄÖåäö]?[a-zåäö]?[a-zåäö]?\\.?[^;.,]+)(.|\\n)*","\\1")) %>% 
  group_by(name) %>% 
  summarise(origs=str_flatten(sql("distinct value"),collapse="|"),n=n(),.groups="drop") %>% 
  collect() %>%
  arrange(desc(n))
## # A tibble: 15,276 × 3
##    name                                                              origs     n
##    <chr>                                                             <chr> <int>
##  1 ""                                                                "\n … 30419
##  2 "Elias Lönnrotin eri lähteistä ja omilta keruiltaan kokoama sana… "Eli…  7055
##  3 "Larin Paraske"                                                   "\n …  1727
##  4 "Elias Lönnrotin kokoamia"                                        "Eli…   883
##  5 "Kirjoituksia suomalaisesta kulttuurista ja runoudesta"           "Kir…   721
##  6 "Ańńi Lehtońi"                                                    "\n …   456
##  7 "Julkaistu sananlaskukokoelma"                                    "Jul…   282
##  8 "Lukeri Melikova"                                                 "\n …   185
##  9 "Kirkonkylän kk:n oppilaat"                                       "\nK…   182
## 10 "Liedakkalan kk:n oppilaat"                                       "\nL…   180
## # ℹ 15,266 more rows

Poem types

poems %>% 
  filter(collection!="literary") %>%
  left_join(p_typ %>% filter(is_minor==0) %>% inner_join(types %>% mutate(type_type=if_else(str_detect(t_id,"^erab_orig"),"Non-unified","Unified")))) %>% 
  group_by(collection,p_id) %>%
  summarise(type_type=case_when(
    any(type_type=="Unified") ~ "Systematisoituja", 
    any(type_type=="Non-unified") ~ "Vain ei-systematisoituja", 
    T ~ "Ei annotointeja"), .groups="drop") %>%
  count(collection,type_type) %>%
  collect() %>%
  mutate(collection=fct_rev(fct_relevel(str_to_upper(collection),"ERAB","SKVR","JR")),type_type=fct_rev(fct_relevel(type_type,"Systematisoituja","Vain ei-systematisoituja","Ei annotointeja"))) %>%
  ggplot(aes(x=collection,y=n,fill=type_type)) + 
  geom_col() + 
  theme_hsci_discrete() + 
  xlab("Kokoelma") +
  ylab("Runoja") +
  labs(fill="Runotyyppiannotaatiot") +
  theme(legend.position="bottom") +
  guides(fill = guide_legend(reverse = TRUE)) +
  scale_y_continuous(labels=scales::number) +
  coord_flip()
## Warning: Missing values are always removed in SQL aggregation functions.
## Use `na.rm = TRUE` to silence this warning
## This warning is displayed once every 8 hours.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `type_type = fct_rev(...)`.
## Caused by warning:
## ! 1 unknown level in `f`: Vain ei-systematisoituja

Spatiotemporal overview

d <- poems %>%
  left_join(p_year %>% mutate(year=if_else(year %in% c(0L,9999L),NA,year))) %>% 
  collect() %>%
  mutate(year_ntile=ntile(year,11)) %>%
  group_by(year_ntile) %>%
  mutate(years=str_c(min(year),"-",max(year))) %>%
  ungroup() %>%
  left_join(p_pl %>% collect()) %>% 
  count(years,pl_id) %>% 
  ungroup() %>%
  left_join(places %>% select(pl_id,name) %>% collect())
best_polygons %>% 
  left_join(d %>% complete(pl_id,years), join_by(pl_id)) %>%
  tm_shape() +
  tm_polygons(col='n', id='name', style='fisher', palette='plasma') +
  tm_layout(main.title="Geographical overviews by time",legend.outside.size=0.1) +
  tm_facets(by="years",ncol=4)

Poem length statistics

By collection

poem_stats %>%
  filter(nverses<=75) %>%
  inner_join(poems) %>%
  count(collection,nverses) %>%
  ungroup() %>%
  ggplot(aes(x=nverses,y=n)) +
  geom_col(width=1) +
  facet_wrap(~collection,scales="free_y") +
  theme_hsci_discrete(base_family="Arial") +
  scale_y_continuous(labels=scales::comma_format()) +
  xlab("Number of verse lines") +
  ylab("Poems") +
  labs(title="Number of verse lines")

poem_stats %>%
  inner_join(poems) %>%
  count(collection,nverses) %>%
  ungroup() %>%
  group_by(collection) %>%
  mutate(prop=n/sum(n)) %>%
  ungroup() %>%
  filter(nverses<=75) %>%
  ggplot(aes(x=nverses,y=collection,fill=collection,height=prop)) +
  geom_density_ridges(stat='identity') +
  theme_hsci_discrete(base_family="Arial") +
#  scale_y_continuous(labels=scales::percent_format()) +
  xlab("Number of verse lines") +
  ylab("Poems") +
  labs(title="Number of verse lines")

Poems with more than 75 verse lines

poem_stats %>%
  inner_join(poems) %>%
  count(collection,nverses) %>%
  mutate(nl=if_else(nverses>75,n,0L)) %>%
  group_by(collection) %>%
  summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups="drop") %>%
  arrange(desc(lines)) %>%
  gt() %>%
  tab_header(title="Poems with more than 75 verse lines") %>%
  fmt_integer(lines) %>%
  fmt_percent(proportion)
Poems with more than 75 verse lines
collection lines proportion
skvr 2,370 2.66%
erab 2,122 1.95%
jr 1,308 1.53%
literary 395 3.75%

By county

poem_stats %>% 
  left_join(p_pl %>% inner_join(places_to_counties) %>% inner_join(places, join_by(county_id==pl_id))) %>%
  count(name,nverses) %>%
  ungroup() %>%
  group_by(name) %>%
  mutate(prop=n/sum(n)) %>%
  ungroup() %>%
  filter(nverses<=40,name!="Ahvenanmaa") %>%
  collect() %>%
  mutate(name=fct_reorder(name,prop,.fun=max)) %>%
  ggplot(aes(x=nverses,y=name,height=prop)) +
  geom_density_ridges(stat='identity') +
  theme_hsci_continuous(base_family="Arial") +
#  scale_y_continuous(labels=scales::percent_format()) +
  xlab("Number of verse lines in poem") +
  ylab("Poems") +
  guides(fill="none") +
  labs(title="Number of verse lines by county")

Poem verse statistics

By collection

Line types

d <- verses %>% 
  left_join(verse_poem) %>% 
  left_join(poems) %>% 
  count(collection,type) %>% 
  ungroup() %>%
  arrange(collection,desc(n)) %>%
  collect()
d %>% 
  group_by(collection) %>%
  mutate(proportion=n/sum(n)) %>%
  gt() %>%
  fmt_integer(n) %>%
  fmt_percent(proportion)
type n proportion
skvr
V 1,340,799 94.63%
L 44,296 3.13%
CPT 27,869 1.97%
K 3,931 0.28%
erab
V 2,006,540 92.77%
PAG 58,218 2.69%
L 25,401 1.17%
TYH 25,303 1.17%
CPT 22,807 1.05%
REF 20,021 0.93%
LRY 4,258 0.20%
RRE 307 0.01%
MRK 52 0.00%
U 38 0.00%
LLI 2 0.00%
TYP 1 0.00%
jr
V 821,809 90.99%
L 49,650 5.50%
CPT 28,118 3.11%
K 3,595 0.40%
literary
V 198,691 93.49%
L 8,297 3.90%
K 2,815 1.32%
CPT 2,727 1.28%

Verse line lengths

d_nr_characters <- verses_cl %>%
  mutate(nr_characters=str_length(text)) %>%
  left_join(verse_poem) %>% 
  left_join(poems) %>% 
  count(collection,nr_characters) %>% 
  ungroup() %>%
  arrange(collection,desc(n)) %>%
  collect()

d_nr_words <- word_occ %>%
  group_by(v_id) %>%
  summarise(nr_words=max(pos),.groups="drop") %>%
  left_join(verse_poem) %>%
  left_join(poems) %>% 
  count(collection,nr_words) %>% 
  ungroup() %>%
  arrange(collection,desc(n)) %>%
  collect()

Verse line lengths in characters

d_nr_characters %>% 
  filter(nr_characters<=60) %>%
  ggplot(aes(x=nr_characters,y=n)) +
  geom_col(width=1) +
  facet_wrap(~collection,scales="free_y") +
  theme_hsci_discrete(base_family="Arial") +
  scale_y_continuous(labels=scales::comma_format()) +
  xlab("Number of characters") +
  ylab("Verses") +
  labs(title="Number of characters in verse lines")

d_nr_characters %>% 
  group_by(collection) %>%
  mutate(prop=n/sum(n)) %>%
  ungroup() %>%
  filter(nr_characters<=60) %>%
  ggplot(aes(x=nr_characters,y=collection,fill=collection,height=prop)) +
  geom_density_ridges(stat='identity') +
  theme_hsci_discrete(base_family="Arial") +
#  scale_y_continuous(labels=scales::percent_format()) +
  xlab("Number of characters") +
  ylab("Verses") +
  labs(title="Number of characters in verse lines")

Verse lines with more than 60 characters

d_nr_characters %>% 
  mutate(nl=if_else(nr_characters>60,n,0L)) %>%
  group_by(collection) %>%
  summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups="drop") %>%
  arrange(desc(lines)) %>%
  gt() %>%
  tab_header(title="Verse lines with more than 60 characters") %>%
  fmt_integer(lines) %>%
  fmt_percent(proportion)
Verse lines with more than 60 characters
collection lines proportion
jr 1,910 0.23%
skvr 761 0.06%
literary 733 0.37%
erab 350 0.02%

Verse line lengths in words

d_nr_words %>% 
  filter(nr_words<=10) %>%
  ggplot(aes(x=nr_words,y=n)) +
  geom_col(width=1) +
  facet_wrap(~collection,scales="free_y") +
  scale_x_continuous(breaks=seq(0,10,by=2)) +
  scale_y_continuous(labels=scales::comma_format()) +
  theme_hsci_discrete(base_family="Arial") +
  xlab("Number of words") +
  ylab("Verses") +
  labs(title="Number of words in verse lines")

d_nr_words %>% 
  filter(nr_words<=10) %>%
  uncount(n) %>%
  ggplot(aes(x=nr_words,y=collection,fill=collection)) +
  stat_binline(binwidth=1) +
  theme_hsci_discrete(base_family="Arial") +
  scale_x_continuous(breaks=seq(0,10,by=2)) +
  xlab("Number of words") +
  ylab("Verses") +
#  scale_y_continuous(labels=scales::percent_format()) +
  labs(title="Number of words in verse lines")

Verse lines with more than 10 words

d_nr_words %>% 
  mutate(nl=if_else(nr_words>10,n,0L)) %>%
  group_by(collection) %>%
  summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups="drop") %>%
  arrange(desc(lines)) %>%
  gt() %>%
  tab_header(title="Verse lines with more than 10 words") %>%
  fmt_integer(lines) %>%
  fmt_percent(proportion)
Verse lines with more than 10 words
collection lines proportion
jr 839 0.11%
skvr 497 0.04%
literary 341 0.17%
erab 306 0.02%
verse_nr_words <- word_occ %>% 
  group_by(v_id) %>%
  summarise(nr_words=max(pos)) %>%
  compute_a(unique_indexes=list(c("v_id","nr_words")))

word_nr_characters <- words %>%
  mutate(nr_characters=str_length(text)) %>%
  select(w_id,nr_characters) %>%
  compute_a(unique_indexes=list(c("w_id","nr_characters")))

d <- word_occ %>%
  left_join(word_nr_characters) %>%
  left_join(verse_nr_words) %>%
  left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% 
  left_join(poems) %>% 
  count(collection,nr_words,pos,nr_characters) %>%
  collect()

By county

Verse line lengths

d_nr_characters <- verses_cl %>%
  mutate(nr_characters=str_length(text)) %>%
  left_join(verse_poem) %>% 
  left_join(p_pl %>% inner_join(places_to_counties) %>% inner_join(places, join_by(county_id==pl_id))) %>%
  count(name,nr_characters) %>% 
  ungroup() %>%
  arrange(name,desc(n)) %>%
  collect()

d_nr_words <- word_occ %>%
  group_by(v_id) %>%
  summarise(nr_words=max(pos),.groups="drop") %>%
  left_join(verse_poem) %>%
  left_join(p_pl %>% inner_join(places_to_counties) %>% inner_join(places, join_by(county_id==pl_id))) %>%
  count(name,nr_words) %>% 
  ungroup() %>%
  arrange(name,desc(n)) %>%
  collect()
## Warning: Missing values are always removed in SQL aggregation functions.
## Use `na.rm = TRUE` to silence this warning
## This warning is displayed once every 8 hours.

Verse line lengths in characters

d_nr_characters %>% 
  group_by(name) %>%
  mutate(prop=n/sum(n)) %>%
  ungroup() %>%
  filter(nr_characters<=40,name!="Ahvenanmaa") %>%
  mutate(name=fct_reorder(name,prop,.fun=max)) %>%
  ggplot(aes(x=nr_characters,y=name,height=prop)) +
  geom_density_ridges(stat='identity') +
  theme_hsci_discrete(base_family="Arial") +
#  scale_y_continuous(labels=scales::percent_format()) +
  xlab("Number of characters") +
  ylab("Verses") +
  labs(title="Number of characters in verse lines")

Verse line lengths in words

d_nr_words %>% 
  filter(nr_words<8,name!="Ahvenanmaa") %>%
  mutate(name=fct_reorder(name,n,.fun=max)) %>%
  uncount(n) %>%
  ggplot(aes(x=nr_words,y=name)) +
  stat_binline(binwidth=1,scale=0.9) +
  theme_hsci_discrete(base_family="Arial") +
  scale_x_continuous(breaks=seq(0,10,by=2)) +
  xlab("Number of words") +
  ylab("Verses") +
#  scale_y_continuous(labels=scales::percent_format()) +
  labs(title="Number of words in verse lines")

Number of characters in words by their position

verse_nr_words <- word_occ %>% 
  group_by(v_id) %>%
  summarise(nr_words=max(pos)) %>%
  compute_a(unique_indexes=list(c("v_id","nr_words")))
## Warning: Missing values are always removed in SQL aggregation functions.
## Use `na.rm = TRUE` to silence this warning
## This warning is displayed once every 8 hours.
word_nr_characters <- words %>%
  mutate(nr_characters=str_length(text)) %>%
  select(w_id,nr_characters) %>%
  compute_a(unique_indexes=list(c("w_id","nr_characters")))

d <- word_occ %>%
  left_join(word_nr_characters) %>%
  left_join(verse_nr_words) %>%
  left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% 
  left_join(poems) %>% 
  count(collection,nr_words,pos,nr_characters) %>%
  collect()
d %>%
  group_by(collection,nr_words,pos) %>%
  mutate(prop=n/sum(n)) %>%
  ungroup() %>%
  filter(nr_words>=2L,nr_words<=5L) %>%
  mutate(nr_words=as_factor(nr_words),pos=as_factor(pos)) %>%
  uncount(n) %>%
  ggplot(aes(x=nr_characters,y=nr_words,fill=pos)) +
  stat_binline(binwidth=1) +
  facet_grid(collection~pos,labeller = labeller(pos=label_both)) + 
  xlab("Number of characters in word") +
  ylab("Number of words in verse") +
  labs(
    title="Number of characters in words by their position",
    subtitle="According to length of verse and collection"
    ) +
  guides(fill="none") +
  theme_hsci_discrete(base_family="Arial")

d %>%
  group_by(collection,nr_words,pos) %>%
  mutate(prop=n/sum(n)) %>%
  ungroup() %>%
  filter(nr_words>=2L,nr_words<=5L) %>%
  mutate(nr_words=as_factor(nr_words),pos=as_factor(pos)) %>%
  uncount(n) %>%
  ggplot(aes(x=nr_characters,y=pos,fill=nr_words)) +
  stat_binline(binwidth=1) +
  facet_grid(collection~nr_words,labeller = labeller(nr_words=label_both)) + 
  xlab("Number of characters in word") +
  ylab("Position") +
  labs(
    title="Number of characters in words by their position",
    subtitle="According to length of verse and collection"
    ) +
  guides(fill="none") +
  theme_hsci_discrete(base_family="Arial")